# create functions and general ggplot vectors to streamline code and make it more reproducible
### General figure parameters
# Helper functions and shared ggplot settings so figures and tables are
# rendered consistently throughout the report
### General figure parameters
height_faceted <- 10

# Wrapper so every table in the report is rendered the same way
kable2 <- function(x) {
  datatable(x)
}

# Qualitative palette shared by all plots
colors <- c("#FFBE98", "#F05A7E", "#125B9A", "#0B8494")

# Dot color highlighting the performance-metrics comparison
dot_color <- "red"

# Propagate the palette to every ggplot produced by this document
thematic::thematic_rmd(
  font = "auto",
  qualitative = colors
)

# Minimal theme as the global default
theme_set(theme_minimal())
The aim of the analysis was to find the best predictors of Women’s NBA players’ salary among their different gaming stats.
Player stats https://www.basketball-reference.com/
Salaries The WNBA players’ salaries were scraped from: https://www.spotrac.com/wnba/rankings/
Variables imported as part of the players’ stats
| Abbreviation | Description |
|---|---|
| Pos | Position |
| G | Games |
| MP | Minutes Played |
| GS | Games Started |
| FG | Field Goals |
| FGA | Field Goal Attempts |
| FG% | Field Goal Percentage |
| 3P | 3-Point Field Goals |
| 3PA | 3-Point Field Goal Attempts |
| 3P% | 3-Point Field Goal Percentage |
| 2P | 2-Point Field Goals |
| 2PA | 2-point Field Goal Attempts |
| 2P% | 2-Point Field Goal Percentage |
| FT | Free Throws |
| FTA | Free Throw Attempts |
| FT% | Free Throw Percentage |
| ORB | Offensive Rebounds |
| TRB | Total Rebounds |
| AST | Assists |
| STL | Steals |
| BLK | Blocks |
| TOV | Turnovers |
| PF | Personal Fouls |
| PTS | Points |
| Abbreviation | Position |
|---|---|
| C | Center |
| C-F | Center-Forward |
| F | Forward |
| F-C | Forward-Center |
| F-G | Forward-Guard |
| G | Guard |
| G-F | Guard-Forward |
# Import the salary dataset and preview its structure
salaries <- read_csv("WNBA - salaries.csv")
glimpse(salaries)
Rows: 596
Columns: 3
$ Player <chr> "Angel McCoughtry", "Skylar Diggins", "Allie Quigley", "Chiney …
$ Salary <chr> "$115,500", "$115,500", "$115,500", "$115,500", "$115,500", "$1…
$ Year <dbl> 2018, 2018, 2018, 2018, 2018, 2018, 2018, 2018, 2018, 2018, 201…
sal_dim <- dim(salaries)
There are 596 observations in the salary dataset with 3 variables.
# Count missing values per column of the salary data
salaries_na <- colSums(is.na(salaries))
There are 0 missing data points in total.
# Total number of fully duplicated rows in the salary data
duplicates <- sum(duplicated(salaries))
There is 1 duplicate in the salary dataset.
salaries[duplicated(salaries),]
# A tibble: 1 × 3
Player Salary Year
<chr> <chr> <dbl>
1 Sylvia Fowles $113,360 2019
The only duplicate concerns Sylvia Fowles. According to spotrac she extended her contract averaging 115k dollars a year. As such this duplicate seems to be a random error and is safe to be removed.
# Drop the duplicated row, harmonize the year column name, and parse
# the salary strings (e.g. "$115,500") into numbers
salaries <- salaries %>%
  rename(year = Year) %>%
  distinct() %>%
  mutate(
    Salary = Salary %>%
      str_replace_all(pattern = "\\$|,", "") %>%
      as.double()
  )
Our dependent variable is salary. As we have data from several years, we would like to adjust the values for inflation. This way they will reflect a more “true” value across years.
# Tabulate how many salary records exist per year
count(salaries, year) %>%
  kable2()
We have salary data for years 2018 - 2023.
# Import CPI data
cpi <- read_csv("CPI.csv")
# Reduce the CPI table to yearly US all-items index values matching the
# salary years; 2023 is only available as quarterly figures (Q1-Q3)
cpi2 <- cpi %>%
  filter(
    Measure == "Index",
    Country == "United States",
    Subject == "CPI: 01-12 - All items",
    Time %in% c(as.character(2018:2022), str_c("Q", 1:3, "-2023"))
  ) %>%
  mutate(Time = str_replace(Time, "Q[1-3]-", "")) %>%
  group_by(Time) %>%
  summarise(
    CPI = mean(Value) # Average the quarterly 2023 values into one yearly figure
  ) %>%
  transmute(CPI, year = as.double(Time))
Here is the formula for the inflation adjustment factor:
\[ Inflation\; Adjustment\; Factor = \frac{CPI \; in\; Base\; Year}{CPI \; in\; Current\; Year} \]
We choose 2023 as the base year. Therefore, we will adjust the salaries from previous years (2018 to 2022) to reflect their value in 2023 dollars.
# CPI of the base year (2023): all salaries are expressed in 2023 dollars
CPI_base <- cpi2 %>%
  filter(year == 2023) %>%
  pull(CPI)
# Join the CPI onto the salaries and compute the inflation-adjusted
# salary. The join key is stated explicitly so a schema change in either
# table cannot silently alter the join.
salaries <- salaries %>%
  left_join(cpi2, by = "year") %>%
  mutate(
    iaf = CPI_base / CPI,     # inflation adjustment factor
    salary_adj = Salary * iaf # salary in 2023 dollars
  ) %>%
  select(-c(CPI, iaf))
# Pick the player with the most salary records as a worked example
player_example <- salaries %>%
  count(Player) %>%
  slice_max(n, n = 1, with_ties = FALSE) %>%
  pull(Player)
# Show raw vs inflation-adjusted salaries for that player
salaries %>%
  filter(Player == player_example) %>%
  select(Player, year, Salary, salary_adj) %>%
  kable2()
As expected, the salaries for all years except for the current (base) year increased, as to reflect their value in 2023 dollars.
# Import the codebook, drop its index column, and sort by variable name
codebook <- read_excel("codebook.xlsx") %>%
  select(-1) %>%
  arrange(variable)
# view(codebook)
# Locate the per-year player-stats files
stats_files <- list.files("players stats")
# Full relative paths for import
stats_import <- str_c("players stats/", stats_files)
# Season year encoded in each file name
wnba_year <- as.double(str_extract(stats_files, pattern = "[0-9]+"))
# Read every yearly file into a list of tibbles
stats_imported <- map(stats_import, read_excel)
# Check that variable names agree across all yearly stats datasets.
# reduce(setdiff) is order-dependent: it only reports names that the
# FIRST dataset has and later ones lack, and misses names appearing only
# in later files. Comparing the union against the intersection is
# symmetric — character(0) means every file shares the same name set.
stats_names <- map(stats_imported, names)
setdiff(
  reduce(stats_names, union),
  reduce(stats_names, intersect)
)
character(0)
There are no differences in names of the datasets when compared to one another.
# All yearly datasets should report the same number of columns
stats_imported %>%
  map_dbl(ncol) %>%
  unique()
[1] 28
All datasets have the same number of variables.
# Tag every yearly dataset with its season year and stack them into one
# table. bind_rows() does this in a single pass; the original
# reduce(add_row) grew the tibble pairwise (quadratic) and relied on
# add_row() accepting a whole data frame, which is not its documented use.
wnba_stats <- stats_imported %>%
  map2(wnba_year, ~ mutate(.x, year = .y)) %>%
  bind_rows()
glimpse(wnba_stats)
Rows: 1,238
Columns: 29
$ Player <chr> "Natalie Achonwa", "Danielle Adams", "Matee Ajavon", "Evelyn Ak…
$ Team <chr> "IND", "CON", "ATL", "DAL", "SAS", "NYL", "NYL", "NYL", "MIN", …
$ Pos <chr> "F", "F-C", "G", "F", "C", "G", "G", "G", "G-F", "G", "G-F", "G…
$ G...4 <dbl> 34, 19, 31, 15, 34, 28, 33, 2, 32, 30, 34, 31, 30, 2, 34, 30, 5…
$ MP...5 <dbl> 623, 83, 250, 61, 522, 376, 304, 3, 886, 283, 1048, 735, 901, 3…
$ G...6 <dbl> 34, 19, 31, 15, 34, 28, 33, 2, 32, 30, 34, 31, 30, 2, 34, 30, 5…
$ GS <dbl> 17, 0, 2, 0, 10, 0, 0, 0, 32, 0, 34, 9, 30, 2, 33, 30, 0, 0, 14…
$ MP...8 <dbl> 623, 83, 250, 61, 522, 376, 304, 3, 886, 283, 1048, 735, 901, 3…
$ FG <dbl> 98, 16, 22, 4, 96, 23, 38, 1, 148, 33, 96, 100, 117, 9, 139, 10…
$ FGA <dbl> 176, 45, 76, 16, 165, 62, 101, 2, 295, 101, 193, 263, 274, 15, …
$ `FG%` <chr> ".557", ".356", ".289", ".250", ".582", ".371", ".376", ".500",…
$ `3P` <dbl> 0, 12, 0, 0, 0, 0, 15, 0, 19, 16, 6, 22, 59, 0, 8, 23, 1, 0, 9,…
$ `3PA` <dbl> 0, 31, 4, 0, 0, 13, 44, 0, 44, 57, 19, 79, 150, 1, 21, 66, 3, 2…
$ `3P%` <chr> NA, ".387", ".000", NA, NA, ".000", ".341", NA, ".432", ".281",…
$ `2P` <dbl> 98, 4, 22, 4, 96, 23, 23, 1, 129, 17, 90, 78, 58, 9, 131, 86, 8…
$ `2PA` <dbl> 176, 14, 72, 16, 165, 49, 57, 2, 251, 44, 174, 184, 124, 14, 27…
$ `2P%` <chr> ".557", ".286", ".306", ".250", ".582", ".469", ".404", ".500",…
$ FT <dbl> 45, 5, 31, 5, 20, 7, 2, 0, 33, 16, 37, 38, 24, 8, 37, 64, 3, 37…
$ FTA <dbl> 59, 5, 39, 6, 22, 10, 6, 0, 38, 20, 46, 47, 31, 11, 47, 90, 5, …
$ `FT%` <chr> ".763", "1.000", ".795", ".833", ".909", ".700", ".333", NA, ".…
$ ORB <dbl> 37, 7, 9, 7, 45, 10, 13, 1, 13, 2, 21, 4, 7, 3, 54, 51, 4, 43, …
$ TRB <dbl> 127, 11, 38, 9, 107, 42, 67, 2, 94, 34, 113, 45, 60, 8, 215, 20…
$ AST <dbl> 25, 4, 30, 0, 17, 62, 17, 1, 127, 22, 76, 89, 199, 5, 53, 44, 0…
$ STL <dbl> 14, 4, 11, 1, 15, 16, 12, 0, 18, 4, 71, 26, 36, 3, 19, 33, 5, 5…
$ BLK <dbl> 17, 4, 0, 3, 17, 1, 13, 0, 3, 0, 17, 4, 3, 0, 59, 13, 2, 5, 8, …
$ TOV <dbl> 31, 7, 28, 5, 32, 19, 18, 1, 45, 12, 45, 27, 62, 2, 62, 48, 3, …
$ PF <dbl> 76, 24, 31, 6, 68, 24, 35, 0, 32, 28, 93, 68, 38, 5, 73, 69, 11…
$ PTS <dbl> 241, 49, 75, 13, 212, 53, 93, 2, 348, 98, 235, 260, 317, 26, 32…
$ year <dbl> 2017, 2017, 2017, 2017, 2017, 2017, 2017, 2017, 2017, 2017, 201…
wnba_dim <- dim(wnba_stats)
The compiled WNBA players’ stats dataset consists of 1238 observations and 29 variables.
# Preview up to 15 unique values of every character column.
# head(., 15) replaces .[1:15]: indexing past the end of a short vector
# (Pos has only 7 levels plus NA) pads the output with spurious NA
# entries, while head() stops at the vector's actual length.
wnba_stats %>%
  select(where(is.character)) %>%
  map(unique) %>%
  map(~head(., 15))
$Player
[1] "Natalie Achonwa" "Danielle Adams" "Matee Ajavon" "Evelyn Akhator"
[5] "Kayla Alexander" "Lindsay Allen" "Rebecca Allen" "Ameryst Alston"
[9] "Seimone Augustus" "Rachel Banham" "Alana Beard" "Alex Bentley"
[13] "Sue Bird" "Brittany Boyd" "Jessica Breland"
$Team
[1] "IND" "CON" "ATL" "DAL" "SAS" "NYL" "MIN" "LAS" "SEA" "CHI" "PHO" "WAS"
[13] "TOT" "LVA" NA
$Pos
[1] "F" "F-C" "G" "C" "G-F" "F-G" "C-F" NA NA NA NA NA
[13] NA NA NA
$`FG%`
[1] ".557" ".356" ".289" ".250" ".582" ".371" ".376" ".500" ".502" ".327"
[11] ".497" ".380" ".427" ".600" ".476"
$`3P%`
[1] NA ".387" ".000" ".341" ".432" ".281" ".316" ".278" ".393" ".381"
[11] ".348" ".333" ".250" ".237" ".292"
$`2P%`
[1] ".557" ".286" ".306" ".250" ".582" ".469" ".404" ".500" ".514" ".386"
[11] ".517" ".424" ".468" ".643" ".483"
$`FT%`
[1] ".763" "1.000" ".795" ".833" ".909" ".700" ".333" NA ".868"
[10] ".800" ".804" ".809" ".774" ".727" ".787"
There are 4 vectors representing proportion values that are incorrectly read as strings and therefore should be coerced to doubles.
# Coerce the percentage columns (read as strings like ".557") to numeric
wnba_stats <- wnba_stats %>%
  mutate(
    across(ends_with("%"), as.double)
  )
# Summarize the unique values of the categorical columns (except Player)
wnba_stats %>%
  select(where(is.character), -Player) %>%
  map(unique) %>%
  enframe(name = "Variable", value = "Unique values") %>%
  mutate(
    `Uniques number` = map(`Unique values`, length),
    `Unique values` = map_chr(`Unique values`, str_c, collapse = ", ")
  ) %>%
  kable2()
There are two character variables, Team and Position, which contain 14 and 7 unique values (categories) respectively.
# Count fully duplicated rows in the stats data
wnba_duplicated <- sum(duplicated(wnba_stats))
There were 0 duplicated rows in the WNBA dataset.
# Descriptive statistics for every numeric column of the stats data
wnba_stats %>%
  get_summary_stats() %>%
  kable2()
Two pairs of variables have exactly the same values for all descriptive statistics, which suggests they are identical.
# Flag columns whose entire contents duplicate an earlier column.
# as.list() replaces the direct S3 method call as.list.data.frame(),
# which bypassed method dispatch.
duplicated_vars <- wnba_stats %>%
  as.list() %>%
  duplicated()
# Show the duplicated columns
wnba_stats[, duplicated_vars] %>%
  kable2()
Two variables are duplicated.
# Remove duplicated variables (keep only the first of each identical pair)
wnba_stats <- wnba_stats[, !duplicated_vars]
# Re-check for duplicated rows after dropping the duplicated columns
sum(duplicated(wnba_stats))
[1] 0
There are no duplicated rows in the WNBA stats dataset.
# Number of extra stats rows per (player, year) beyond the first
players_year_dup <- sum(duplicated(wnba_stats[, c("Player", "year")]))
There are 145 cases of additional rows of stats for a given player in a given year.
# Identify every (player, year) pair with more than one row of stats,
# most repetitions first
player_duplicates <- wnba_stats %>%
  count(Player, year, sort = TRUE) %>%
  filter(n > 1)
kable2(player_duplicates)
We have 69 cases in which a given player in a given year had more than one row of game stats.
# The repetitions should vanish once Team is part of the counting key
wnba_stats %>%
  count(Player, Team, year) %>%
  filter(n > 1)
# A tibble: 0 × 4
# ℹ 4 variables: Player <chr>, Team <chr>, year <dbl>, n <int>
The identified repetition disappear when we include the team variable. This suggests that some of the players changed their team a given number of times in a given year.
# Build a lookup key for the repeated (player, year) pairs. A separator
# is included so that distinct player/year combinations can never
# collide after concatenation (e.g. "X 1" + "2020" vs "X" + "12020").
player_filtering <- player_duplicates %>%
  mutate(repeated_filter = str_c(Player, year, sep = "|")) %>%
  pull(repeated_filter)
# Show the teams those players played for in the affected years
wnba_stats %>%
  mutate(repeated_filter = str_c(Player, year, sep = "|")) %>%
  filter(repeated_filter %in% player_filtering) %>%
  select(Player, Team, year) %>%
  arrange(Player, year) %>%
  kable2()
For instance, a player named AD Durr played on three different teams during 2022. This raises a methodological question.
Given that the salaries dataset shows salaries for a given year for a given player, to combine salaries data with wnba stats, each player should have one summary set of stats for a given year.
A possible solution is to sum the stats over a given year for players who played on more than one team during this year.
The only limitation would be the position variable, as it is a categorical variable.
# Check whether any player was listed under more than one position in
# the same year (which would prevent summing stats across teams).
# The original chained count()/filter() only caught (Player, Pos, year)
# combinations that themselves repeated, so a player with a DIFFERENT
# position per team (each appearing once) slipped through undetected.
# Counting distinct positions per (player, year) catches every change.
wnba_stats %>%
  distinct(Player, year, Pos) %>%
  count(Player, year) %>%
  filter(n >= 2)
# A tibble: 0 × 3
# ℹ 3 variables: Player <chr>, year <dbl>, n <int>
However, each player that changed team in a given year nevertheless remained playing on the same position. We can therefore summarize their results over given years and add them back to the dataset. We will have to exclude the percentage variables during the summarization and calculate them once the summarized stats of the identified players are joined with the remaining stats of player stats in a given year.
# Collapse multi-team seasons: sum every numeric stat per (player, year).
# Percentage columns are excluded — they must be recomputed from the
# summed successes/attempts, not summed themselves.
wnba_summarized <- wnba_stats %>%
  select(-ends_with("%")) %>%
  group_by(Player, year) %>%
  summarise(
    across(where(is.double), sum),
    .groups = "drop" # explicit, replaces the trailing ungroup()
  )
# Keep one position per (player, year); selecting the grouping keys
# explicitly avoids the "adding missing grouping variables" message
wnba_position <- wnba_stats %>%
  select(Player, year, Pos) %>%
  group_by(Player, year) %>%
  slice(1) %>%
  ungroup()
# Attach the position back to the summed stats with an explicit key
wnba_stats2 <- wnba_position %>%
  left_join(wnba_summarized, by = c("Player", "year"))
# Base names ("FG", "3P", "2P", "FT") of the success/attempt pairs,
# derived from the percentage column names
pct_filter <- wnba_stats %>%
  names() %>%
  str_subset("%$") %>%
  str_remove("%")
# Recompute each percentage as successes / attempts on the summed data.
# Within every pair, alphabetical order puts successes ("FG") before
# attempts ("FGA"), so column 1 / column 2 is the success rate.
wnba_pct <- pct_filter %>%
  set_names() %>%
  map(function(stat) {
    pair <- wnba_stats2 %>%
      select(starts_with(stat)) %>%
      select(sort(names(.)))
    pair[, 1] / pair[, 2]
  })
# Column-bind the recomputed percentages and append "%" to their names.
# bind_cols() replaces reduce(add_column): add_column() documents
# name-value pairs rather than whole data frames as input, and a single
# bind_cols() call avoids the pairwise reduction.
wnba_pct <- wnba_pct %>%
  bind_cols() %>%
  rename_with(~str_c(., "%"))
# Attach the percentage columns to the summarised stats and restore
# readable names for the columns that were mangled on import
wnba_stats3 <- wnba_stats2 %>%
  bind_cols(wnba_pct) %>%
  rename(
    Games = G...4,
    Minutes = MP...5
  )
# Sanity check: successes, attempts, and the recomputed percentage for
# the first category should be mutually consistent
wnba_stats3 %>%
  select(starts_with(pct_filter[[1]])) %>%
  kable2()
# Count missing values per column, keeping only columns with any NA
wnba_missing <- wnba_stats3 %>%
  is.na() %>%
  colSums() %>%
  keep(~. != 0) %>%
  sort(decreasing = TRUE)
wnba_missing %>%
  enframe(name = "variable", value = "missing_sum") %>%
  kable2()
Missing data were observed only for the variables measuring percentage of successes in a given performance category.
Let’s inspect observations in terms of the absolute success and overall attempts separately for each category only for the missing data.
## Inspect all missing observations ##
# For each category, pull the rows where the percentage is NA, sort the
# columns into (successes, percentage, attempts) order, and label them
# with the category name
wnba_NAs <- map(seq_along(pct_filter), function(i) {
  wnba_stats3 %>%
    select(starts_with(pct_filter[i])) %>%
    filter(if_any(everything(), is.na)) %>%
    select(sort(names(.))) %>%
    set_names(c("Absolute successes", "Pct successes", "Total attemps")) %>%
    mutate(category = pct_filter[[i]])
})
# Stack the distinct NA patterns of every category into one summary table
wnba_NAs %>%
  map(distinct) %>%
  reduce(add_row) %>%
  select(category, everything()) %>%
  knitr::kable()
| category | Absolute successes | Pct successes | Total attemps |
|---|---|---|---|
| FG | 0 | NaN | 0 |
| 3P | 0 | NaN | 0 |
| 2P | 0 | NaN | 0 |
| FT | 0 | NaN | 0 |
As we can see, missing data for percentage of success occurred only in situations when there were zero total attempts. This makes sense, because you can’t calculate a proportion of something to nothing.
The missing data for the performance metrics expressed in percentages raises another methodological problem. The missing data cannot be imputed given the mathematical nature of those variables. First, let’s inspect each set of performance metrics separately, to see how the three metrics in a given category are related to each other.
# One data frame per goal category (successes, attempts, percentage),
# with rows containing NA removed
wnba_chunks <- pct_filter %>%
  map(~select(wnba_stats3, starts_with(.))) %>%
  map(drop_na)
# Correlation heat map for each category, assembled into a grid
ggcorr_sets <- wnba_chunks %>%
  map(~ggcorrplot(cor(.), lab = TRUE))
do.call(cowplot::plot_grid, ggcorr_sets)
There are near perfect linear associations between successes and attempts for each category of player’s performance. In case of the linear models, it would be necessary to remove either of the two for each pair, in order to avoid multicollinearity.
# Draw the pairwise scatter plots for every category chunk
for (chunk in wnba_chunks) {
  plot(chunk)
}
Interestingly, the relationship between the percentage of successful attempts and the absolute values of the successes and attempts is not linear.
The majority of the observations were located within a specific and Gaussian-like interval. Even though those are scatter plots and not histograms, it is interesting to see that the relationships between a given percentage metric and either of the two remaining performance metrics seem to follow a normal distribution. This suggests that the percentage metrics also carry information about the “norm” or the “average” skill with 2-Point Goals and Field Goals being located mostly between 30% and 65% and 3-Point Goals between 20% and 45%.
# Range (min to max) of each percentage metric among players with at
# least 10 attempts in every category
wnba_stats %>%
  filter(if_all(c("2PA", "FTA", "3PA", "FGA"), ~. >= 10)) %>%
  select(ends_with("%")) %>%
  get_summary_stats() %>%
  pivot_longer(c(min, max)) %>%
  ggplot(aes(value, variable, group = variable)) +
  geom_line(color = "black") +
  geom_point() +
  labs(
    title = "Percentage metric values for players who had at least 10 goal attempts",
    y = "",
    x = "Percentage of successes"
  ) +
  # `labels` spelled out in full: the original relied on partial
  # matching of `label`, which is fragile
  scale_x_continuous(labels = scales::label_percent())
However, the percentage metrics suffer from a critical shortcoming. Players below or above the respective “average” ranges were noticeably similar to one another, with values close to zero for both attempts and successes. Among players with at least 10 attempts there were virtually none who had a higher success rate than 70%. A similar case can be made for the lower ranges for 2-Point and Field goals. This implies that having only a few to several attempts makes a player’s success percentage less objective as a performance measure, as one or two shots can determine whether a player has a 0% or 100% success rate.
The exception is for FT (free throws) for which majority of the observations had a success rate above 60% which was also associated with a higher variance on both absolute measures - attempts and successes. Given that a free throw is unhindered and always from the same position, it would be on average easier to succeed in a given attempt.
# Summary stats for the rows whose success rate falls OUTSIDE the
# 0.2-0.7 band (i.e. at most 20% or at least 70% of attempts successful);
# the percentage column is the third one in every chunk
wnba_chunks %>%
  map(
    ~filter(., .[[names(.)[[3]]]] <= 0.2 | .[[names(.)[[3]]]] >= 0.70)
  ) %>%
  map(get_summary_stats) %>%
  reduce(add_row) %>%
  kable2()
This poses a methodological trade-off dilemma. On one hand, the percentage index, by showing the proportion of successes to all attempts, seems to capture better the effectiveness of a player. On the other hand, it seems to favor or overlook players who had just a few attempts in total, with either mostly successes or mostly misses.
It also raises the question of what to do with NA values.
# Shift salaries one year back so each salary lines up with the
# previous season's stats (pay is assumed to follow performance)
salaries_back <- salaries %>%
  mutate(year = year - 1)
Assuming a cause and effect relationship, we would expect that salary would be a consequence of player’s prior overall performance. Therefore, before combining datasets, we should move salaries one year back to associate it with stats from the year prior to the year of salary.
# Join the shifted salaries onto the stats; the key is explicit so the
# join cannot silently pick up extra common columns
wnba_combined <- salaries_back %>%
  left_join(wnba_stats3, by = c("Player", "year"))
# Keep only rows that have at least one performance stat and encode the
# categorical columns as factors. all_of() wraps the external name
# vector — tidyselect deprecates bare character vectors here.
wnba_combined2 <- wnba_combined %>%
  filter(!if_all(-all_of(names(salaries)), is.na)) %>%
  mutate(
    year = factor(year),
    Pos = factor(Pos)
  )
# Missing-value count per column after joining salaries and stats
colSums(is.na(wnba_combined)) %>%
  sort(decreasing = TRUE) %>%
  enframe(name = "variable", value = "Number of missing values") %>%
  kable2()
Joining the salary and the WNBA stats datasets resulted in 135 observations that lacked data in all performance stats.
# Modelling dataset: drop the identifier and the unadjusted salary, and
# put the response (salary_adj) first
wnba_complete <- wnba_combined2 %>%
  select(-c(Salary, Player)) %>%
  select(salary_adj, everything())
# Count the remaining missing values (zero-attempt percentage stats)
colSums(is.na(wnba_complete)) %>%
  keep(~. != 0) %>%
  enframe(name = "variable", value = "Number of missing values") %>%
  kable2()
After removing observations with no performance stats, we are still left with a substantial number of observations that have zero attempts in at least one of the four performance categories.
# Descriptive statistics of the final modelling dataset
wnba_complete %>%
  get_summary_stats() %>%
  kable2()
The final dataset consisted of 460 records and 26 variables.
# Long format of the full stats data (all players), labelled "full"
pivoted_full_stats <- wnba_stats3 %>%
  select(where(is.double), -year) %>%
  pivot_longer(everything()) %>%
  mutate(stats_data = "full")
# Long format of the reduced (salary-matched) modelling data
pivoted_everything <- wnba_complete %>%
  pivot_longer(-c(Pos, year))
# Stack both long tables, labelled by origin ("full" vs "reduced")
pivoted_combied <- pivoted_everything %>%
  select(-c(year, Pos)) %>%
  mutate(stats_data = "reduced") %>%
  add_row(pivoted_full_stats)
# Density overlays: full vs reduced distribution of every variable
pivoted_combied %>%
  ggplot(aes(value, color = stats_data)) +
  geom_density() +
  facet_wrap(~name, scales = "free", ncol = 4) +
  theme(legend.position = "top")
There were moderate differences between the distributions of the full
and reduced datasets for specific variables.
The most noticeable differences concerned Field Goals and Field Goal Attempts, 2-point goals and 2-point goal attempts, Points, Turnovers, Steals, and Minutes, showing a proportionally smaller number of lower values in the dataset that had to be reduced due to lack of the corresponding salary data.
This suggests that the salary information available on the website from which it was scraped included salaries primarily for better-performing players.
The analyzed sample, therefore, is not representative of the general population and, as such, will reduce the external validity of the following statistical modeling.
# Histogram of every variable in the reduced dataset
pivoted_everything %>%
  ggplot(aes(value)) +
  geom_histogram() +
  facet_wrap(~name, ncol = 4, scales = "free")
Almost all performance variables, including 2-points and 3-points goals and attempts, field goals and field goals attempts, free throws and free throws attempts, assists, blocks, steals, and total and offensive rebounds, had a right-skewed distribution, meaning that as the performance was getting better, the number of players was decreasing. This more or less follows the Pareto Distribution which is to be expected in variables that represent human skill.
In contrast, Games had a left-skewed distribution with most players playing in around 35 games in a given year, while Games Started was a bimodal distribution, with a substantial number of players not forming the starting lineup in a given year.
Personal Fouls, with the exception of several outliers on the higher end, approximated a normal distribution with most players engaging in between 25 and 75 fouls in a given year.
Interestingly, the distribution of salary was bimodal with the two peaks being strikingly higher than the rest of the values which had a relatively high spread, from 50k to 250k. This suggests that the salary doesn’t reflect the performance stats directly and that other, perhaps not available variables, might play a role.
# Bin the adjusted salaries into 30 intervals to locate the modes
wnba_complete %>%
  mutate(salary_cat = cut(salary_adj, breaks = 30)) %>%
  count(salary_cat, sort = TRUE)
# A tibble: 30 × 2
salary_cat n
<fct> <int>
1 (1.31e+05,1.38e+05] 64
2 (7.01e+04,7.69e+04] 53
3 (7.69e+04,8.36e+04] 46
4 (1.38e+05,1.44e+05] 32
5 (6.34e+04,7.01e+04] 22
6 (1.17e+05,1.24e+05] 20
7 (2.12e+05,2.19e+05] 20
8 (1.24e+05,1.31e+05] 19
9 (1.98e+05,2.05e+05] 18
10 (9.71e+04,1.04e+05] 15
# ℹ 20 more rows
Most of the salaries were between 131k and 144k or 70k and 84k.
# Restore a shifted year value to its original date (adds one year).
#
# x      - year as a factor, character, or numeric value
# factor - return a factor (TRUE, default) or a numeric vector
year_move <- function(x, factor = TRUE) {
  shifted <- as.double(as.character(x)) + 1
  if (factor) {
    return(as.factor(shifted))
  }
  shifted
}
# Salary distribution per (restored) year
wnba_complete %>%
  mutate(year = year_move(year)) %>% # shift back to the true salary year
  ggplot(aes(salary_adj)) +
  geom_histogram() +
  facet_wrap(~year)
Until 2020, none of the players earned more than 155k dollars. Since
then, the salaries of a noticeable number of players have risen to
between 155k and 255k dollars. However, the number of players with
salaries at the lower end (around 75k - 80k) has also grown, making them
the majority.
# Boxplot of every variable in the reduced dataset
pivoted_everything %>%
  ggplot(aes(value)) +
  geom_boxplot() +
  facet_wrap(~name, scales = "free")
# Share of players per position, annotated with raw counts
positions <- wnba_complete %>%
  freq_table(Pos)
positions %>%
  ggplot(aes(Pos, prop)) +
  # geom_col() is the idiomatic form of geom_bar(stat = "identity")
  geom_col() +
  # vjust is a constant, so it belongs outside aes()
  geom_text(aes(label = str_c("n = ", n)), vjust = -0.5)
Majority of the players played either on the G or F positions. A still substantial number of players played on the C position. There were few players who played on a mixed position in a given year.
# Obtain means, medians, and percentage change of both, for salaries
# (raw and inflation-adjusted), one row per year and salary variable
salaries_stats <- wnba_combined2 %>%
  select(year, contains("salary")) %>%
  group_by(year) %>%
  get_summary_stats() %>%
  select(year, variable, mean, median) %>%
  group_by(variable) %>%
  mutate(
    # Year-over-year relative change; the first year yields NA
    mean_change = (mean - lag(mean)) / lag(mean),
    median_change = (median - lag(median)) / lag(median),
  ) %>%
  # The first year has no previous value, so its change is set to 0
  mutate(across(c(mean_change, median_change), ~replace_na(., 0))) %>%
  ungroup() %>%
  mutate(year = year_move(year)) # restore the true salary year
# Plot absolute changes in salaries: mean and median per year, raw vs
# inflation-adjusted
salaries_stats %>%
  pivot_longer(c(mean, median)) %>%
  ggplot(aes(year, value, color = variable, group = variable)) +
  geom_point() +
  geom_line() +
  facet_wrap(~name)
The absolute change in salaries over the years is higher for salaries before the adjustment for inflation, which is especially visible since 2020. Interestingly, the mean absolute change from 2021 to 2022 was negative for the adjusted salary.
# Year-over-year percentage change of mean and median salaries
salaries_stats %>%
  pivot_longer(c(mean_change, median_change)) %>%
  ggplot(aes(year, value, color = variable, group = variable)) +
  geom_point() +
  geom_line() +
  geom_hline(yintercept = 0, linetype = "dashed") + # zero-change reference
  facet_wrap(~name)
The difference in the rate of change between adjusted and unadjusted values of salaries is even more clear when presented in the percentage change. However, even after the adjustment, we can see that the salaries of WNBA players grew, with an exception around 2021. This could be explained by the difficulties in organizing matches and gathering larger crowds that the sport organizations faced due to the COVID-19 pandemic.
# Adjusted salary spread per (restored) year
wnba_complete %>%
  ggplot(aes(year_move(year), salary_adj)) +
  geom_boxplot()
Starting with 2020, a quarter of the players were earning more than $150,000 after adjusting for inflation, which was the maximum salary before that year.
Since 2021, the spread of the salaries has appeared to stabilize.
# Obtain per-position medians and counts so the counts can be placed on
# the boxplot just below each median line
positions_median <- wnba_complete %>%
  group_by(Pos) %>%
  get_summary_stats(salary_adj, type = "median") %>%
  mutate(
    salary_adj = median - 6000, # offset so the label sits under the median
    n = str_c("n = ", n)
  )
wnba_complete %>%
  ggplot(aes(Pos, salary_adj)) +
  geom_boxplot() +
  geom_text(aes(label = n), data = positions_median)
F-C position had the highest proportion of high-earning players, with half earning more than $200,000. This was closely followed by the G-F position, where salaries were slightly smaller. However, in both cases, the sample of players was small and, therefore, more prone to random fluctuations, making their estimates less reliable.
There were no substantial differences between the three unmixed positions (C, F, and G). Given that only these had relatively large sample sizes, this suggests that the position does not have a significant effect on the players’ earnings.
# Pivot data for the
pivoted_Onsalary <- wnba_complete %>%
select(where(is.double)) %>%
pivot_longer(-salary_adj)
pivoted_Onsalary %>%
ggplot(aes(value, salary_adj)) +
geom_hex() +
geom_smooth(se = F, alpha = 1/3, size = 0.75, color = "orange") +
scale_fill_viridis_c(option = "C", direction = 1) +
facet_wrap(~name, scales = "free", ncol = 4) +
theme(legend.position = "top")
Higher number of goals, regardless of the category, were associated with higher salaries, until reaching a level after which further increases in goals did not seem to have an impact. The exception was the 3-Points Goals, where, on average, each additional goal seemed to contribute to the salary.
A similar trend was observed for less direct performance metrics, like number of Assists, Blocks, Steals, and Turnovers. In some cases there was a reverse at the higher values of a predictor with a small drop in salary, but this was due to just a few outliers and as a marker of the general relationship can be disregarded.
Overall, the associations between salary and different performance stats seem weak. Only a few could be roughly approximated using a linear model (e.g. 3-Points Goals and number of Assists). This suggests that the relationship between a player’s salary and her contribution to games is complex, and using non-linear modelling, like one that relies on decision trees, would be better at capturing it.
library(ggcorrplot)
# Spearman correlation heat map across all numeric predictors;
# pairwise-complete handles the NA percentage values
wnba_complete %>%
  select(-c(salary_adj, year, Pos)) %>%
  cor(use = "pairwise.complete.obs", method = "spearman") %>%
  ggcorrplot(
    type = "upper",
    insig = "blank"
  )
The majority of the continuous predictors were strongly and positively correlated with each other.
However, one performance statistic stood out - 3-point goals, for the most part, was either weakly or moderately correlated with the remaining predictors and with one (Offensive Rebounds), it was even negatively associated.
Interestingly, the goal metrics, when expressed as a proportion of successes to attempts, had mostly weak associations and sometimes negative ones with the rest of the predictors. This might be explained by their tendency to distort the true performance of the players who made only a few attempts in total.
# Obtain a data frame with correlations between predictors
# (rstatix::cor_test returns one row per variable pair)
cor_predictors <- wnba_complete %>%
select(-c(salary_adj, year, Pos)) %>%
cor_test(method = "pearson") %>%
# drop the diagonal (self-correlations)
filter(cor != 1) %>%
arrange(desc(abs(cor))) %>%
mutate(across(where(is.numeric), ~round(., 3))) %>%
select(-c(statistic, method))
cor_predictors %>%
kable2()
There are a substantial number of highly correlated predictors. First and foremost, we see near perfect correlations between successes and attempts for each performance category pair. Similarly, there are very strong to near perfect correlations between number of points and some of the performance metrics, specifically, 2-point field goals, field goals, and free throws.
# Filter through only the goal performance metrics
# NOTE(review): pct_filter is not defined in this chunk — presumably a
# character vector of goal-metric names created earlier; verify
cor_predictors %>%
filter(var1 %in% pct_filter, var2 %in% pct_filter ) %>%
kable2()
Field goals, 2-point goals, and free throws were very strongly correlated with one another.
# Obtain the number of strong correlations (|cor| >= 0.8) for each predictor separately
cor_predictors %>%
filter(abs(cor) >= 0.8) %>%
count(var1, name = "High correlations count") %>%
arrange(desc(`High correlations count`)) %>%
kable2()
Field Goals, Field Goals Attempts and Points were the predictors with the highest count of strong correlations (|cor| > 0.8), posing the highest risk for multicollinearity when fitting linear models.
# Parallelize the per-variable ANOVAs across worker sessions
plan(multisession)
# Compare Positions in terms of continuous predictors using ANOVA
pos_anova <- wnba_complete %>%
select(-c(year, salary_adj)) %>%
pivot_longer(-Pos) %>%
nest(.by = name) %>%
mutate(
# one ANOVA (value ~ Pos) per predictor, results bound in as columns
future_map_dfr(data, ~as_tibble(anova_test(., value ~ Pos))),
significant = p <= 0.05
)
plan(sequential)
# Dot plot of effect sizes (generalized eta-squared), ordered by size;
# the dashed line separates significant from non-significant predictors
pos_anova %>%
ggplot(aes(ges, fct_reorder(name, ges), color = significant, alpha = significant)) +
geom_point() +
geom_hline(
yintercept = sum(!pos_anova$significant) + 0.5,
linetype = "dashed"
) +
labs(x = "Effect size (eta-squared)", y = "") +
scale_alpha_discrete(range = c(0.4, 1))
There were 13 continuous predictors which were significantly affected by Position, and in 8 of these cases the effect could be deemed large.
The strongest differences were observed for Offensive Rebounds and Blocks which was closely followed by percentage of Field Goals, 3-Points Goal Attempts, Total Rebounds and 3-Points Goals.
There were no differences in non-performance stats, like Number of Games and Minutes played and, somewhat surprisingly, total number of Points and Field Goals.
# Extract names of the outcomes for which the p was significant
pos_sig <- pos_anova %>%
filter(p <= 0.05) %>%
pull(name)
# Plot the comparisons of Positions in terms of the significant variables
# (facets ordered by descending effect size)
wnba_complete %>%
select(-c(salary_adj, year)) %>%
select(Pos, all_of(pos_sig)) %>%
pivot_longer(-Pos) %>%
# join brings in ges (effect size) for facet ordering
left_join(pos_anova) %>%
ggplot(aes(Pos, value)) +
geom_boxplot() +
facet_wrap(~fct_reorder(name, ges, .desc = T), scales = "free", ncol = 3)
Center and Center-Forward positions were associated with higher numbers of Offensive Rebounds, Blocks and - to a smaller degree - Field Goal percentage, as well as lower numbers of 3-point Goals and Attempts in comparison to the remaining positions.
The highest numbers of assists were observed for the Forward-Center, Forward-Guard, and Guard positions. Forward-Guard and Guard also had the highest numbers of 3-point Goals. On the other hand, these two and also the Guard-Forward position, had the smallest numbers of Total Rebounds.
# Rename variables to make them compatible with all modelling functions
# (leading digits and "%" are not syntactically valid R names)
wnba_complete <- wnba_complete %>%
rename_with(~str_replace_all(., "2", "Two_")) %>%
rename_with(~str_replace_all(., "3", "Three_")) %>%
rename_with(~str_replace_all(., "%", "_pct")) %>%
# "-" in position labels would also break formulas; use "_" instead
mutate(Pos = str_replace(Pos, "-", "_") %>% factor())
# Turn factors into a dummy variable format (drop the intercept column)
model.matrix(~ ., data = wnba_complete) %>%
as_tibble() %>%
select(-1)
# A tibble: 417 × 35
salary_adj year2018 year2019 year2020 year2021 year2022 PosC_F PosF PosF_C
<dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
1 139775. 0 0 0 0 0 0 0 0
2 139775. 0 0 0 0 0 0 0 0
3 139775. 0 0 0 0 0 0 0 0
4 139775. 0 0 0 0 0 0 0 1
5 139775. 0 0 0 0 0 0 0 0
6 139775. 0 0 0 0 0 0 1 0
7 136750. 0 0 0 0 0 0 1 0
8 136750. 0 0 0 0 0 0 1 0
9 136750. 0 0 0 0 0 0 1 0
10 136750. 0 0 0 0 0 1 0 0
# ℹ 407 more rows
# ℹ 26 more variables: PosF_G <dbl>, PosG <dbl>, PosG_F <dbl>, Games <dbl>,
# Minutes <dbl>, GS <dbl>, FG <dbl>, FGA <dbl>, Three_P <dbl>,
# Three_PA <dbl>, Two_P <dbl>, Two_PA <dbl>, FT <dbl>, FTA <dbl>, ORB <dbl>,
# TRB <dbl>, AST <dbl>, STL <dbl>, BLK <dbl>, TOV <dbl>, PF <dbl>, PTS <dbl>,
# FG_pct <dbl>, Three_P_pct <dbl>, Two_P_pct <dbl>, FT_pct <dbl>
# Extract values from the R Markdown parameters (params list)
log_transform <- params$log_transform # Add datasets with log transformed outcome
arrange_metric <- params$arrange_by # Performance metric by which the models should be arranged
# Arrange models by the metric of choice
# Arrange a table of model results by a performance metric.
# Descending for Rsquared (higher is better), ascending for error
# metrics such as RMSE/MAE (lower is better).
#
# data   - a data frame containing the metric column
# metric - metric column name as a string; defaults to the document-level
#          `arrange_metric` so existing no-argument calls keep working
arrange_own <- function(data, metric = arrange_metric) {
  if (metric == "Rsquared") {
    # .data pronoun is the recommended tidy-eval idiom for a string
    # column name (pick() on an env string was ambiguous tidyselect)
    arrange(data, desc(.data[[metric]]))
  } else {
    arrange(data, .data[[metric]])
  }
}
# Create two versions of datasets: "full" keeps the percentage metrics
# (missing percentages replaced by 0), "reduced" drops them entirely
model_full <- wnba_complete %>%
mutate(across(ends_with("pct"), ~replace_na(., 0)))
model_reduced <- wnba_complete %>%
select(-ends_with("pct"))
model_datasets <- tibble(
dataset = c("full", "reduced"),
whole = list(model_full, model_reduced)
)
# Add log-transformed-outcome copies of both datasets, yielding four
# variants: full/reduced x original/log
model_datasets <- model_datasets %>%
rename(original = whole ) %>%
mutate(log = map(original, function(x) {mutate(x, salary_adj = log(salary_adj))})) %>%
pivot_longer(-dataset, names_to = "transformed", values_to = "whole") %>%
mutate(
dataset = str_c(dataset, "_", transformed),
dataset = factor(dataset, levels = c(
"full_original",
"reduced_original",
"full_log",
"reduced_log"
))
) %>%
select(-transformed)
# Set seed
set.seed(123)
# Set up k-indexing: one shared ~80/20 train/test assignment, reused for
# every dataset variant so their rows stay aligned
indices <- sample(1:2, nrow(wnba_complete), prob = c(0.8, 0.2), replace = T)
# Extract general formula (outcome against all predictors)
general_formula <- salary_adj ~ .
# Split each dataset and precompute dummy-coded frames/matrices for the
# modelling functions that require numeric inputs
model_datasets <- model_datasets %>%
mutate(
train = map(whole, ~.[indices == 1,]),
test = map(whole, ~.[indices == 2,]),
train_dummy = map(train, ~as_tibble(model.matrix(~ ., .x)[,-1])),
test_dummy = map(test, ~as_tibble(model.matrix(~ ., .x)[,-1])),
x_train = map(train, ~model.matrix(general_formula, .x )[, -1]),
y_train = map(train, ~.x$salary_adj),
x_test = map(test, ~model.matrix(general_formula, .x )[, -1]),
y_test = map(test, ~.x$salary_adj),
)
# Extract model and its performance metrics from a caret object
extract_caret <- function(model) {
  # Pull the cross-validation performance table out of a fitted caret
  # object, keeping only the mean metrics (per-fold SD columns dropped)
  perf <- model$results %>%
    as_tibble() %>%
    select(-ends_with("SD"))
  # Attach the final refitted model as a list-column next to its metrics
  perf %>%
    mutate(fit = list(model$finalModel))
}
cross_fit <- function(x_data, y_data, ..., tuning, method = "glmnet") {
  # 10-fold cross-validation control shared by every call
  cv_control <- trainControl(method = "cv", number = 10)
  # Fit the model over the supplied tuning grid; extra arguments in ...
  # are forwarded to caret::train (e.g. ntree for random forests)
  fitted <- train(
    x_data, y_data, ...,
    method = method,
    tuneGrid = tuning,
    trControl = cv_control
  )
  # Hand back the final model together with its CV performance metrics
  extract_caret(fitted)
}
Linear models will be fitted using a combination of forward and backward stepwise selections.
# Extract regsubsets function
# Run a best-subset search (leaps::regsubsets) on `data`.
#
# data    - training data frame
# nvmax   - maximum model size to consider
# method  - "forward" or "backward" stepwise search
# formula - model formula; defaults to the document-level
#           `general_formula` (salary_adj ~ .) so existing calls are
#           unchanged, but the dependency on the global is now explicit
regsubset_own <- function(data, nvmax, method = "forward",
                          formula = general_formula) {
  regsubsets(
    data = data,
    nvmax = nvmax,
    x = formula,
    method = method
  )
}
# Function for predicting values from regsubset object
# Refit (with 10-fold CV) the linear model that regsubsets selected at
# size `id`, returning the model plus its performance metrics.
#
# regsubset - fitted leaps::regsubsets object
# id        - model size (number of terms) to extract
# data      - (dummy-coded) training data to refit on
# outcome   - response variable name; defaults to "salary_adj" so all
#             existing calls are unaffected
subset_lm <- function(regsubset, id, data, outcome = "salary_adj") {
  # Names of the selected predictors (intercept excluded)
  xvars <- coef(regsubset, id = id) %>%
    names() %>%
    keep(~. != "(Intercept)")
  # Build the model formula from the selected terms; base
  # stats::reformulate replaces the manual string paste + as.formula
  form <- reformulate(xvars, response = outcome)
  # Set up the train control for 10-fold cross-validation
  train_control <- trainControl(method = "cv", number = 10)
  # Fit via caret so the CV metrics come back in a consistent format
  model <- train(form, data = data, method = "lm", trControl = train_control)
  extract_caret(model)
}
# Set seed
set.seed(123)
# Run forward and backward subset searches on every dataset variant;
# nvmax = total column count lets the search consider all predictors
regsubsets_models <- model_datasets %>%
select(dataset, train, train_dummy, test_dummy) %>%
mutate(
# candidate model sizes: 1 .. number of predictors
id = map(train, ~1:(ncol(.) - 1)),
predictors_total = map_dbl(train, ncol),
fit_forward = map2(train, predictors_total, regsubset_own,method = "forward"),
fit_backward = map2(train, predictors_total, regsubset_own,method = "backward")
)
Reordering variables and trying again:
Reordering variables and trying again:
Reordering variables and trying again:
Reordering variables and trying again:
Reordering variables and trying again:
Reordering variables and trying again:
Reordering variables and trying again:
Reordering variables and trying again:
# Fit all best selected models
regsubsets_models2 <- regsubsets_models %>%
select(-c(train, test_dummy, predictors_total)) %>%
pivot_longer(c(fit_forward, fit_backward), names_to = "method", values_to = "reg_model") %>%
unnest(id) %>%
mutate(
# refit the size-`id` selection with 10-fold CV
lm = pmap(list(reg_model, id, train_dummy), subset_lm),
) %>%
unnest(lm) %>%
mutate(
across(where(is.double), ~round(., 3)),
# NOTE(review): predictors = id - 1 assumes one selected term is not a
# substantive predictor — confirm the intended off-by-one here
predictors = id - 1
) %>%
arrange_own()
# Select the best models (top row per dataset after arranging)
best_lm <- regsubsets_models2 %>%
group_by(dataset) %>%
slice(1) %>%
select(dataset, fit) %>%
ungroup()
# 10 best
regsubsets_best10 <- regsubsets_models2 %>%
select(dataset, predictors, method, RMSE, MAE, Rsquared) %>%
slice(1:10)
regsubsets_best10 %>%
kable2()
Cross-validation revealed that the best 10 linear models explained between 0.359 and 0.371 of the total variance, having between 11 and 21 predictors.
# Plot scatterplots of number of variables against R-squared
# (the best model per dataset is highlighted via alpha)
regsubsets_models2 %>%
group_by(dataset) %>%
mutate(best = Rsquared == max(Rsquared)) %>%
ggplot(aes(predictors, Rsquared, color = dataset, alpha = best)) +
geom_point() +
facet_wrap(~dataset)
The explanatory power of the linear models consistently increases up to
around 10 predictors, after which the relationship plateaus.
# Same comparison for the error metrics (RMSE/MAE; lower is better)
regsubsets_models2 %>%
select(dataset, predictors, RMSE, MAE) %>%
pivot_longer(c( RMSE, MAE), names_to = "Metric", values_to = "Performance") %>%
group_by(dataset, Metric) %>%
mutate(best = Performance == min(Performance)) %>%
ggplot(aes(predictors, Performance, color = dataset, alpha = best)) +
geom_point() +
facet_grid(dataset ~ Metric, scales = "free")
# Obtain coefficients, residuals and diagnostics
best_lm2 <- best_lm %>%
mutate(
# standardized (beta) coefficients
coefs = map(fit, ~lm.beta::lm.beta(.) %>% tidy()),
# per-observation fitted values, residuals and influence statistics
augmented = map(fit, augment),
vif = map(fit, vif),
vif_names = map(vif, names)
)
# One row per model term; estimates starred when p <= 0.05
lm_coefs <- best_lm2 %>%
select(dataset, coefs) %>%
unnest() %>%
group_by(dataset) %>%
arrange(desc(std_estimate)) %>%
ungroup() %>%
mutate(across(where(is.double), ~round(., 3))) %>%
mutate(
std_estimate2 = if_else(
p.value <= 0.05,
str_c(std_estimate, "*"),
as.character(std_estimate)
),
significant = p.value <= 0.05
)
# Plot standardized coefficients of the predictors for the four best linear models together
# (x-axis centered on zero so positive/negative effects are symmetric)
lm_coefs %>%
ggplot(aes(std_estimate, term, color = dataset, alpha = significant)) +
geom_point(position = position_jitter(width = 0, height = 0.3)) +
coord_cartesian(xlim = c(
-abs(max(lm_coefs$std_estimate, na.rm = T)),
abs(max(lm_coefs$std_estimate, na.rm = T)))
) +
geom_vline(xintercept = 0)
Regardless of the model, field goals, year, and number of games were the strongest predictors of a WNBA player’s salary.
The more field goals a player had, the better her salary was. Interestingly, the reverse was true for the number of games. Salaries were visibly higher from 2019 to 2022 than in 2017 and 2018 with a stabilization being achieved beginning with 2019.
Additionally, the number of minutes on the field was an important predictor in models with the log-transformed outcome.
# Show standardized coefficients for all models in a table
lm_coefs %>%
select(dataset, term, std_estimate2) %>%
pivot_wider(names_from = dataset, values_from = std_estimate2) %>%
kable2()
# Expand the per-observation diagnostics for the residual checks below
best_lm3 <- best_lm2 %>%
unnest(augmented) %>%
select(-fit)
# Plot histograms of residuals (normality check)
best_lm3 %>%
ggplot(aes(.resid)) +
geom_histogram() +
facet_wrap(~dataset, scales = "free")
# Plot Q-Q plots of residuals against theoretical normal quantiles
best_lm3 %>%
ggplot(aes(sample = .resid)) +
geom_qq() +
geom_qq_line() +
facet_wrap(~dataset, scales = "free")
Judging by histograms and Q-Q plots, a slight deviation from normality can be observed, with more observations at the tails. However, this slight deviation should not significantly affect the estimation of the coefficients.
# Fitted values vs residuals (homoscedasticity / linearity check)
best_lm3 %>%
ggplot(aes(.fitted, .resid)) +
geom_point() +
geom_smooth(se = F) +
facet_wrap(~dataset, scales = "free")
The models fitted to the untransformed data suffered from heteroscedasticity, with a lower error variance at the smaller predicted values. This was somewhat, but not entirely, corrected in the models with the log-transformed outcome. Moreover, all models displayed non-linearity.
# Plot VIF (variance inflation factors; multicollinearity check)
best_lm2 %>%
select(dataset, vif, vif_names) %>%
unnest() %>%
ggplot(aes(vif, vif_names, color = dataset)) +
geom_point(position = position_jitter(height = 0.3))
Each of the four best models reported extremely high VIF values for the same two predictors: - FG (Field goals) - FGA (Field goal attempts)
This is in line with the correlation analyses, which revealed that the two predictors had the highest count of correlation above |0.8| with the remaining predictors.
Additionally, models fitted to the datasets with the log-transformed outcome reported a high VIF for Minutes, while models fitted to the datasets with the untransformed outcome had a high VIF for TRB (Total Rebounds).
# Add conditional variables indicating influential and outlier values
best_lm3 <- best_lm3 %>%
  # if_else(cond, T, F) was redundant — the comparison is already logical
  mutate(Cook_high = .cooksd >= 1)
# Plot Cook's distance values; d >= 1 is the influence cut-off used above
best_lm3 %>%
  ggplot(aes(dataset, .cooksd)) +
  geom_jitter()
No influential values were observed.
# Plot standardized residuals (|value| > 3 would flag outliers)
best_lm3 %>%
ggplot(aes(dataset, .std.resid)) +
geom_jitter()
No observations with a standardized residual greater than |x| > 3 were found.
Even though the best linear models explained a substantial proportion of the outcome’s variance, they suffered from moderate heteroscedasticity and, even more importantly, they displayed non-linearity and some of their predictors had a very high VIF. All this means that the estimated coefficients are not reliable making the models less trust-worthy.
# Set seed
set.seed(123)
# Create search grid for lambda parameter in lasso (alpha = 1 -> pure lasso)
tune_lasso <- data.frame(
alpha = 1,
lambda = 10^seq(3, -3, length = 100)
)
# Start parallel computing
# NOTE(review): num_cores is not defined in this chunk — presumably set
# earlier in the document; verify
cl <- makeCluster(num_cores)
registerDoParallel(cl)
# Fit lasso models with 10-fold CV on every dataset variant
lasso_models <- model_datasets %>%
select(dataset, x_train, y_train) %>%
mutate(
fit_lasso = map2(x_train, y_train, cross_fit, tuning = tune_lasso),
) %>%
unnest(fit_lasso) %>%
arrange_own()
# Stop parallel computing
stopCluster(cl)
registerDoSEQ()
# Extract best models (top row per dataset after arranging)
best_lasso <- lasso_models %>%
group_by(dataset) %>%
slice(1) %>%
select(dataset, lambda, fit) %>%
ungroup()
# Extract performance metrics information
lasso_models <- lasso_models %>%
select(-c(x_train, y_train, alpha, fit))
# Inspect performance metrics statistics for unusual values
lasso_models %>%
group_by(dataset) %>%
get_summary_stats(-lambda) %>%
kable2()
A high number of models fitted to the transformed datasets lack R-square estimation.
# Summarize only the models with missing R-squared values
lasso_models %>%
filter(is.na(Rsquared)) %>%
get_summary_stats() %>%
kable2()
As seen in the table above, the lack of R-squared estimation concerned models with higher lambda values. This is probably associated with the nature of the outcome after log-transformation which largely shrank its original value.
# Extract the best 10 models (table is already ordered by arrange_own)
best_lasso10 <- lasso_models %>%
slice(1:10)
# Arrange lasso models in terms of the chosen performance metric
lasso_models %>%
kable2()
Cross-validation revealed that the best 10 linear models explained between 0.3429291 and 0.345523 of the total variance, with lambda values between 0.0020092 and 657.9332247.
# Plot scatterplots of log(lambda) against R-squared per dataset
lasso_models %>%
group_by(dataset) %>%
mutate(
best = Rsquared == max(Rsquared, na.rm = T),
lambda = log(lambda)
) %>%
ungroup() %>%
drop_na(Rsquared) %>%
ggplot(aes(lambda, Rsquared, color = dataset, alpha = best, shape = best)) +
geom_point() +
facet_wrap(~dataset, scales = "free_x")
Models fitted to the untransformed datasets resulted in similar performance across all lambda values, with models being slightly better at the higher lambda values. The latter was somewhat more pronounced in the case of the reduced dataset.
In contrast, lasso models fitted to the datasets with log-transformed
outcome performed better at smaller values of lambda with a noticeable
and consistent decrease starting with a log(lambda) value
of -4.
# Same comparison for the error metrics (RMSE/MAE; lower is better)
lasso_models %>%
drop_na(Rsquared) %>%
select(dataset, lambda, RMSE, MAE) %>%
pivot_longer(c( RMSE, MAE), names_to = "Metric", values_to = "Performance") %>%
group_by(dataset, Metric) %>%
mutate(
best = Performance == min(Performance),
lambda = log(lambda)
) %>%
ungroup() %>%
ggplot(aes(lambda, Performance, color = dataset, alpha = best, shape = best)) +
geom_point() +
facet_grid(dataset ~ Metric, scales = "free")
# Extract coefficients of only the best lasso models at their optimal lambda
best_lasso <- best_lasso %>%
mutate(
coefs = map2(fit, lambda, ~coef(.x, s = .y)),
coefs = map(coefs, ~as.data.frame.matrix(.) %>% rownames_to_column() )
)
# Fit linear models using only those predictors which coefficients weren't reduced to zero
best_lasso_reduced <- best_lasso %>%
unnest(coefs) %>%
filter(s1 != 0, rowname != "(Intercept)") %>%
select(dataset, rowname) %>%
nest(.by = dataset) %>%
mutate(
# rebuild "salary_adj ~ x1 + x2 + ..." from the surviving predictors
x_vars = map_chr(data, ~str_c(unlist(.), collapse = " + ")),
formula = str_c("salary_adj ~ ", x_vars),
formula = map(formula, as.formula)
) %>%
select(dataset, formula) %>%
left_join(model_datasets) %>%
ungroup() %>%
mutate(
fit = map2(formula, train_dummy, ~lm(.x, .y))
) %>%
select(dataset, test_dummy, fit)
# Obtain model diagnostic statistics and performance metrics
# (RMSE / R-squared / MAE computed on the held-out test data)
best_lasso_reduced2 <- best_lasso_reduced %>%
mutate(
RMSE = map2_dbl(fit, test_dummy, rmse),
Rsquared = map2_dbl(fit, test_dummy, rsquare),
MAE = map2_dbl(fit, test_dummy, mae),
map_df(fit, glance),
coefs = map(fit, ~lm.beta::lm.beta(.) %>% tidy()),
vif = map(fit, vif),
vif_names = map(vif, names),
augmented = map(fit, augment)
) %>%
arrange_own()
# Extract coefficients from the updated best lasso models after reduction of zero-coefficient predictors
lasso_coefs <- best_lasso_reduced2 %>%
select(dataset, coefs) %>%
unnest() %>%
group_by(dataset) %>%
arrange(desc(std_estimate)) %>%
ungroup() %>%
mutate(across(where(is.double), ~round(., 3))) %>%
mutate(
# star the estimates that reach significance
std_estimate2 = if_else(
p.value <= 0.05,
str_c(std_estimate, "*"),
as.character(std_estimate)
),
significant = p.value <= 0.05
)
# Plot standardized coefficients of the predictors for the four best linear models together
lasso_coefs %>%
ggplot(aes(std_estimate, term, color = dataset, alpha = significant)) +
geom_point(position = position_jitter(width = 0, height = 0.3)) +
coord_cartesian(xlim = c(
-abs(max(lasso_coefs$std_estimate, na.rm = T)),
abs(max(lasso_coefs$std_estimate, na.rm = T)))
) +
geom_vline(xintercept = 0)
# Extract augmented statistics (per-observation diagnostics)
best_lasso_augmented <- best_lasso_reduced2 %>%
select(dataset, augmented) %>%
unnest(augmented)
# Plot the histograms of the models' residuals (normality check)
best_lasso_augmented %>%
ggplot(aes(.std.resid)) +
geom_histogram() +
facet_wrap(~dataset)
The histograms indicate that the distributions of the residuals of all models roughly conformed to the normal distribution.
# Obtain Q-Q plots of the models' residuals
# (points should follow the reference line under normality)
best_lasso_augmented %>%
ggplot(aes(sample = .std.resid)) +
geom_qq() +
geom_qq_line() +
facet_wrap(~dataset)
Based on the Q-Q plots, it can be observed that the models fitted on the untransformed salary variable did not deviate significantly from the normal distribution.
The case was somewhat different for the models fitted on the logarithmically transformed dependent variable, where moderate deviations were observed at the edges.
# Plot scatterplots of the predicted values against the residuals
# (homoscedasticity check)
best_lasso_augmented %>%
ggplot(aes(.fitted, .resid)) +
geom_point() +
geom_smooth(se = F) +
facet_wrap(~dataset, scales = "free")
All models suffered from heteroscedasticity, although this was less pronounced in the case of those which were fitted on the logarithmically transformed outcome.
# Collect VIF values per model for the multicollinearity check
best_lasso_vif <- best_lasso_reduced2 %>%
select(dataset, vif, vif_names) %>%
unnest()
# Plot VIF of the best lasso models
best_lasso_vif %>%
unnest() %>%
ggplot(aes(vif, vif_names, color = dataset)) +
geom_point(position = position_jitter(height = 0.3))
Assuming the criterion of VIF > 4, all models were found to have some degree of multicollinearity, with the field goals predictor achieving the highest multicollinearity in each of the models.
# Add conditional variables indicating influential and outlier values
best_lasso_augmented <- best_lasso_augmented %>%
  # if_else(cond, T, F) was redundant — the comparison is already logical
  mutate(Cook_high = .cooksd >= 1)
# Plot Cook's distance values; d >= 1 is the influence cut-off used above
best_lasso_augmented %>%
  ggplot(aes(dataset, .cooksd)) +
  geom_jitter()
No influential values were observed.
# Plot standardized residuals (|value| > 3 would flag outliers)
best_lasso_augmented %>%
ggplot(aes(dataset, .std.resid)) +
geom_jitter()
No observations with a standardized residual greater than |x| > 3 were found.
# Set seed
set.seed(123)
# Set up tuning grid for tree models (cp = complexity/pruning parameter)
tune_tree <- data.frame(cp = seq(0.001, 0.1, 0.001) )
# Obtain tree models via 10-fold CV on every dataset variant
tree_models <- model_datasets %>%
select(dataset, x_train, y_train) %>%
mutate(
fit_tree = map2(
x_train,
y_train,
cross_fit,
tuning = tune_tree,
method = "rpart"
))
# Arrange models in terms of RMSE
tree_models2 <- tree_models %>%
select(-contains("train")) %>%
unnest(fit_tree) %>%
arrange(RMSE)
# Extract best tree models (top row per dataset)
best_tree <- tree_models2 %>%
group_by(dataset) %>%
slice(1) %>%
select(dataset, fit) %>%
ungroup()
# Arrange tree models from best to worst
tree_models2 %>%
select(-fit) %>%
arrange_own() %>%
kable2()
# Plot R-squared of the models across the cp grid
tree_models2 %>%
select(-fit) %>%
group_by(dataset) %>%
mutate(best = Rsquared == max(Rsquared, na.rm = T)) %>%
ungroup() %>%
ggplot(aes(cp, Rsquared, color = dataset, alpha = best)) +
geom_point() +
facet_wrap(~dataset, scales = "free")
Overall, regardless of the dataset category, smaller values of complexity parameter were associated with a better model fit. Therefore, allowing for more complex tree models with less pruning resulted in better predictions of the salary.
# Same comparison for the error metrics (RMSE/MAE; lower is better)
tree_models2 %>%
select(-fit) %>%
pivot_longer(
cols = c(RMSE, MAE),
values_to = "Performance",
names_to = "Metric"
) %>%
group_by(dataset, Metric) %>%
mutate(best = Performance == min(Performance, na.rm = T)) %>%
ungroup() %>%
ggplot(aes(cp, Performance, color = dataset, alpha = best, shape = best)) +
geom_point() +
facet_grid(dataset ~ Metric, scales = "free")
Judging by RMSE, models achieved the best fit when the complexity was at a level between 0.025 and 0.05.
Analysis of the MAE results indicate a stronger difference in model optimization between different datasets, with the models fitted to the datasets with a log-transformed outcome showing preference for higher cp values, above around 0.04.
# Function to extract predictors and their importance from the final tree
rpart_predictors <- function(fit) {
  # Importance scores restricted to predictors that actually appear as
  # splits in the final rpart tree.
  # Variables used at internal nodes ("<leaf>" rows are terminal nodes)
  split_vars <- fit$frame$var
  split_vars <- split_vars[split_vars != "<leaf>"]
  # Keep only the importance entries whose names match a splitting variable
  importance_all <- fit$variable.importance
  importance_all[names(importance_all) %in% split_vars]
}
# Plot predictors importance for the best tree per dataset
best_tree %>%
mutate(
importance = map(fit, rpart_predictors),
importance_names = map(importance, names)
) %>%
unnest(cols = -fit) %>%
ggplot(aes(y = importance_names, x = importance, color = dataset)) +
geom_point(position = position_jitter(width = 0.2, height = 0.2))
Number of Games Started was the only predictor that was used as a split in all four tree models and it was also associated with the highest importance in the models fitted to the untransformed datasets.
# Move best tree fits into a list keyed by dataset name
best_trees_list <- best_tree$fit %>%
set_names(best_tree$dataset)
# Tree diagram for the full untransformed dataset
# (negative `digits` selects rpart.plot's alternative number formatting)
rpart.plot(best_trees_list$full_original, digits = -3)
The optimal tree model among those fitted to the full untransformed dataset predicted the highest salary for WNBA players who had at least 11 Games Started, more than 0.5 Two Point Field Goals percentage and played in 2018. Having fewer than 11 Games Started was associated with the lowest salary.
# Tree diagram for the reduced untransformed dataset
rpart.plot(best_trees_list$reduced_original, digits = -3)
The best among models fitted to the reduced untransformed dataset revealed that only number of Games Started was predictive of salary with players having at least 11 Games Started having a higher salary.
# Tree diagrams for the two log-transformed-outcome datasets
rpart.plot(best_trees_list$full_log, digits = -5)
rpart.plot(best_trees_list$reduced_log, digits = -5)
Similar to the reduced untransformed dataset, the optimal model fitted to the reduced dataset with a log-transformed outcome revealed only one predictor: players with at least 11 games started were associated with a higher salary.
Overall, the tree models performed noticeably worse in comparison to the linear models.
# Tuning grid for random forest (mtry = variables sampled per split)
tuning_forest <- data.frame(mtry = seq(2, ncol(model_full), 2))
# Set seed
set.seed(123)
# Start parallel computing
cl <- makeCluster(num_cores)
registerDoParallel(cl)
# Quick sanity look at the first design matrix
model_datasets$x_train[[1]]
# Fit random forest models: one CV run per (dataset, ntree) combination
forest_models <- model_datasets %>%
expand_grid(ntree = c(100, 150, 200, 250, 300, 400, 500, 800, 1000)) %>%
select(dataset, x_train, y_train, ntree) %>%
mutate(fit_forest = pmap(
list(x_data = x_train, y_data = y_train, ... = ntree),
cross_fit,
tuning = tuning_forest,
method = "rf")
)
# forest_models2 <- forest_models
# Stop parallel computing
stopCluster(cl)
registerDoSEQ()
# Export random forest object to avoid long processing time
save(forest_models, file = "forest_models.RData")
load("forest_models.RData")
# Extract best forest models for each dataset type
best_forest <- forest_models %>%
unnest(fit_forest) %>%
group_by(dataset) %>%
arrange_own() %>%
select(dataset, fit) %>%
slice(1) %>%
ungroup()
# Extract model performance metrics by ntree and mtry
forest_fitted <- forest_models %>%
unnest(fit_forest) %>%
select(dataset, ntree, mtry, RMSE, Rsquared, MAE) %>%
arrange_own()
# Best 10 forest models
best_forest10 <- forest_fitted %>%
slice(1:10)
# Show all models
forest_fitted %>%
kable2()
Cross-validation revealed that the best 10 random forest models explained between 0.256804 and 0.2705142 of the total variance, with mtry being between 12 and 26 predictors.
Except for one model, the preferable number of trees for growing was between 100 and 150.
The analysis also revealed that the best models were those fitted either to the reduced untransformed data or the full dataset with a log-transformed outcome.
# Add information about best fit for each model type based on R2
forest_rsquared <- forest_fitted %>%
group_by(dataset) %>%
mutate(best = Rsquared == max(Rsquared)) %>%
ungroup()
# Set common options for heatmaps (higher values = brighter fill)
heatmap_aesthetics <- list(
scale_fill_viridis_c(option = "D", direction = 1)
)
# Set common options for heatmaps with reversed fill direction
# (for error metrics, where lower values are better)
heatmap_aesthetics2 <- list(
scale_fill_viridis_c(option = "D", direction = -1)
)
# Compare tuning parameters in terms of the R-squared on a heatmap;
# the colored dot marks the best model per dataset
forest_rsquared %>%
ggplot(aes(factor(mtry), factor(ntree))) +
geom_tile(aes(fill = Rsquared)) +
geom_point(data = filter(forest_rsquared, best), color = dot_color) +
facet_wrap(~dataset, scales = "free") +
labs(y = "Number of trees to grow",x = "Number of variables randomly sampled") +
theme(legend.position = "top") +
heatmap_aesthetics
Overall, models with higher mtry values were associated
with a higher R-squared. A more pronounced difference was observed for
the number of trees parameter, with a higher number of trees leading to
better performance in the full untransformed dataset, while, in
contrast, a smaller number of trees was more effective for the reduced
untransformed dataset.
# Add information about best fit for each model type based on RMSE/MAE
forest_performance <- forest_fitted %>%
  pivot_longer(cols = c(RMSE, MAE), names_to = "Metric", values_to = "Performance") %>%
  group_by(dataset, Metric) %>%
  mutate(
    best = Performance == min(Performance),
    # BUG FIX: the labels were inverted — datasets whose name contains
    # "log" ARE the (log-)transformed ones, so they must be labelled
    # "Transformed"; previously the plot titles described the wrong group
    transformed = if_else(str_detect(dataset, "log"), "Transformed", "Untransformed")
  ) %>%
  ungroup() %>%
  nest(.by = c(Metric, transformed))
# Create list for all plots
plots_forest <- list()
# Create heatmaps for each metric and separately for transformed and untransformed datasets
for (i in 1:nrow(forest_performance)) {
plots_forest[[i]] <- forest_performance$data[[i]] %>%
# filter(str_detect(dataset, "original"), Metric == "RMSE") %>%
ggplot(aes(factor(mtry), factor(ntree))) +
geom_tile(aes(fill = Performance)) +
geom_point(data = filter(forest_performance$data[[i]], best), color = "red") +
facet_wrap(~dataset, scales = "free") +
labs(
y = "Number of trees to grow",
x = "Number of variables randomly sampled",
title = str_c(
"Comparison of ", forest_performance$transformed[[i]], " models in terms of ", forest_performance$Metric[[i]]
)
) +
heatmap_aesthetics2
}
# Compare the first pair of heatmaps in terms of RMSE and MAE
# NOTE(review): which datasets land in plots 1-2 depends on the row
# order forest_fitted had after arrange_own() — verify the index mapping
cowplot::plot_grid(plots_forest[[1]], plots_forest[[2]], nrow = 2)
When compared in terms of RMSE and MAE, random forest models fitted
to the untransformed datasets revealed a similar pattern of tuning
parameter preference to the R-squared comparison. However, the analysis
of mean absolute error identified a different optimal model for the
reduced untransformed dataset, which was associated with a higher
mtry value.
# Compare the second pair of heatmaps in terms of RMSE and MAE
# NOTE(review): which datasets land in plots 3-4 depends on the row
# order forest_fitted had after arrange_own() — verify the index mapping
cowplot::plot_grid(plots_forest[[3]], plots_forest[[4]], nrow = 2)
The pattern of tuning parameter preference for the transformed dataset
model fitting was very similar across all three performance metrics,
with the same tuning parameters identified for the optimal models.
# Extract predictors' importance (IncNodePurity) for each best model
best_forest2 <- best_forest %>%
mutate(
importance_purity = map(fit, ~importance(.) %>% as.data.frame() %>% rownames_to_column())
)
# Plot predictors importance values for the untransformed datasets
best_forest2 %>%
select(dataset, importance_purity) %>%
unnest() %>%
filter(str_detect(dataset, "original")) %>%
ggplot(aes(y = rowname, x = IncNodePurity, color = dataset)) +
geom_point()
Interestingly, the random forest models revealed
Games and
Games Started as the most important predictors. Less
important, but still noticeable predictors, were
Field Goals, Personal Fouls,
Points, and 2-Point Field Goals.
# Plot predictors importance values for the transformed datasets
best_forest2 %>%
select(dataset, importance_purity) %>%
unnest() %>%
filter(str_detect(dataset, "log")) %>%
ggplot(aes(y = rowname, x = IncNodePurity, color = dataset)) +
geom_point()
# Plot partial predictions of the selected variables; log-outcome
# predictions are back-transformed with exp() so all four datasets
# share the salary scale
# NOTE(review): the `predictor`/`salary`/`predictor_name` columns are
# presumably produced by a partial-dependence step not shown here — verify
best_forest2 %>%
unnest() %>%
mutate(salary = if_else(str_detect(dataset, "log"), exp(salary), salary)) %>%
ggplot(aes(predictor, salary, color = dataset, group = dataset)) +
geom_line() +
facet_wrap(~predictor_name, scales = "free") +
theme(legend.position = "top")
Across all datasets, Field Goals, Games Started, Points, and Two Points Field Goals were positively associated with Salary, while number of Games and Personal Fouls were negative predictors of Salary.
Salary started to increase substantially above 50 Field Goals, with the increase levelling off at around 150 Field Goals. Similar trends were observed for Games Started for numbers between 10 and 30 and, less sharply, for Points for values between 200 and 500.
The increase in salary in response to increase in number of Two Points Field Goals was more drastic, with a high increase of the former between 75 and 90 Two Points Field Goals with values above 100 up to 269 showing no further improvement. This suggests that there is a narrow effect window strongly differenting players in terms of their salary.
Number of Games was associated with a steady decrease in salary until leveling at around a value of 40.
Personal Fouls displayed a more discrete relationship with Salary with values around 10 and then 45 marking two large drops in Salary.
# Set seed for reproducibility of cross-validation resampling
set.seed(123)
# Candidate values for the xgboost tuning parameters that are searched over
boost_rounds <- c(25, 50, 100, 200)
boost_depths <- seq(1, 9, 2)
boost_etas <- c(0.05, 0.1, 0.15, 0.2)
# Full tuning grid for boosted trees (remaining parameters held fixed)
tune_boosted <- expand_grid(
  nrounds = boost_rounds,
  max_depth = boost_depths,
  eta = boost_etas,
  gamma = 0,
  colsample_bytree = 1,
  min_child_weight = 1,
  subsample = 1
)
# Define a quiet version of the cross_fit function to capture warnings
# (purrr::quietly wraps cross_fit so warnings/messages are returned as list
# elements alongside the result instead of being emitted)
quiet_cross_fit <- quietly(cross_fit)
# Set up parallel computing
# NOTE(review): num_cores is assumed to be defined earlier in the document
cl <- makeCluster(num_cores)
registerDoParallel(cl)
# Train boosted tree models — one cross-validated fit per dataset variant
boosted_models <- model_datasets %>%
  select(dataset, x_train, y_train) %>%
  mutate(
    fit_boost = map2(x_train, y_train, quiet_cross_fit, tuning = tune_boosted, method = "xgbTree")
  )
# Stop parallel computing and return to sequential execution
stopCluster(cl)
registerDoSEQ()
# Extract strings with warning information
# quietly() returns a list per fit (result, output, warnings, messages); after
# the first unnest each list element becomes a row, and the character elements
# are the captured warning/message strings. Empty strings are dropped.
boosted_warnings <- boosted_models %>%
  select(dataset, fit_boost) %>%
  unnest(fit_boost) %>%
  filter(map_lgl(fit_boost, is.character)) %>%
  unnest(fit_boost) %>%
  filter(fit_boost != "")
# Show the captured warnings in an interactive table
boosted_warnings %>%
  kable2()
# Extract boosted model results statistics.
# Keep only the tibble elements of the quietly() output (the actual
# cross-validation results), dropping the captured warning strings.
# is.tibble() is deprecated in tibble >= 2.0 — use is_tibble().
boosted_models <- boosted_models %>%
  select(dataset, fit_boost) %>%
  unnest(fit_boost) %>%
  filter(map_lgl(fit_boost, is_tibble))
# Performance metrics for every boosted model / tuning-parameter combination
boosted_models2 <- boosted_models %>%
  unnest(fit_boost) %>%
  select(dataset, eta, max_depth, nrounds, RMSE, MAE, Rsquared) %>%
  arrange_own()
# Keep the top 10 boosted models for later reference
boosted_best10 <- boosted_models2 %>%
  slice_head(n = 10)
# Extract the best boosted model per dataset
# NOTE(review): arrange_own() is a project helper defined elsewhere and is
# assumed to sort best-first; dplyr::arrange ignores groups unless
# .by_group = TRUE, so confirm arrange_own handles grouped input as intended.
best_boosted <- boosted_models %>%
  unnest(fit_boost) %>%
  group_by(dataset) %>%
  arrange_own() %>%
  select(dataset, fit) %>%
  slice(1) %>%
  ungroup()
# Inspect performance metrics among the models fitted to the log-transformed
# datasets (these were the ones with missing resampled performance values)
log_boosted <- boosted_models2 %>%
  filter(str_detect(dataset, "log"))
log_boosted %>%
  group_by(dataset) %>%
  get_summary_stats(RMSE, MAE, Rsquared)
# A tibble: 6 × 14
dataset variable n min max median q1 q3 iqr mad mean sd
<fct> <fct> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
1 full_log RMSE 80 0.373 3.14 0.383 0.378 0.402 0.023 0.008 0.626 0.679
2 full_log MAE 80 0.304 3.12 0.317 0.313 0.335 0.022 0.009 0.56 0.687
3 full_log Rsquared 75 0.186 0.25 0.224 0.209 0.231 0.022 0.018 0.22 0.017
4 reduced… RMSE 80 0.369 3.14 0.396 0.385 0.411 0.026 0.016 0.634 0.677
5 reduced… MAE 80 0.307 3.12 0.327 0.321 0.345 0.024 0.013 0.567 0.685
6 reduced… Rsquared 80 0.001 0.244 0.195 0.18 0.212 0.032 0.025 0.186 0.052
# ℹ 2 more variables: se <dbl>, ci <dbl>
Five of the models fitted to the full and transformed dataset lacked R-squared estimation. Some models fitted to the reduced transformed dataset resulted in very small R-squared values.
# List the tuning combinations with missing or near-zero R-squared
boosted_models2 %>%
  filter(Rsquared < 0.01 | is.na(Rsquared))
# A tibble: 10 × 7
dataset eta max_depth nrounds RMSE MAE Rsquared
<fct> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
1 reduced_log 0.05 1 25 3.14 3.12 0.00128
2 reduced_log 0.05 3 25 3.14 3.12 0.00128
3 reduced_log 0.05 5 25 3.14 3.12 0.00128
4 reduced_log 0.05 7 25 3.14 3.12 0.00128
5 reduced_log 0.05 9 25 3.14 3.12 0.00128
6 full_log 0.05 1 25 3.14 3.12 NaN
7 full_log 0.05 3 25 3.14 3.12 NaN
8 full_log 0.05 5 25 3.14 3.12 NaN
9 full_log 0.05 7 25 3.14 3.12 NaN
10 full_log 0.05 9 25 3.14 3.12 NaN
The reported missing values in resampled performance measures seem to be associated with the models fitted to the dataset with the log-transformed outcome that relied on a small learning rate and a small number of boosting iterations. Given the small size of these two tuning parameters and the log-transformed outcome, the models probably did not have enough iterations to converge properly, i.e. they failed to learn meaningful patterns.
# Show all boosted models in a table
kable2(boosted_models2)
The best 10 boosted tree models, as revealed by cross-validation, explained between 0.2423929 and 0.2502521 of the total variance, with mtry between 12 and 26 predictors (note: mtry is a random forest tuning parameter — this figure appears to be carried over from the previous section; boosted trees are tuned via eta, max_depth, and nrounds).
Most of the best 10 models were those fitted to the full dataset with a log-transformed outcome.
# Combine the eta and nrounds columns into a single factor for plotting, and
# drop the models with missing or near-zero R-squared (they did not converge)
boosted_models3 <- boosted_models2 %>%
  mutate(
    eta_nround = interaction(eta, nrounds)
  ) %>%
  filter(!(is.na(Rsquared) | Rsquared < 0.01))
# Flag the optimal model (highest R-squared) within each dataset
# (use TRUE rather than the reassignable shorthand T)
boosted_rsquared <- boosted_models3 %>%
  group_by(dataset) %>%
  mutate(best = Rsquared == max(Rsquared, na.rm = TRUE)) %>%
  ungroup()
# Compare tuning parameters in terms of R-squared on a heatmap.
# Axis labels corrected: the previous labels ("Number of trees to grow" /
# "Number of variables randomly sampled") described random forest parameters,
# but the axes here plot the boosted tree parameters eta.nrounds and max_depth.
boosted_rsquared %>%
  ggplot(aes(factor(max_depth), eta_nround)) +
  geom_tile(aes(fill = Rsquared)) +
  geom_point(data = filter(boosted_rsquared, best), color = "red") +
  facet_wrap(~dataset, scales = "free") +
  labs(y = "Learning rate and boosting iterations (eta.nrounds)", x = "Maximum tree depth") +
  heatmap_aesthetics
# Add information about the optimal model based on RMSE and MAE.
# BUG FIX: datasets whose name contains "log" have a log-transformed outcome,
# so they must be labelled "Transformed" — the labels were inverted before.
boosted_performance <- boosted_models3 %>%
  pivot_longer(cols = c(RMSE, MAE), names_to = "Metric", values_to = "Performance") %>%
  group_by(dataset, Metric) %>%
  mutate(
    # Lowest RMSE/MAE marks the best model within each dataset-metric group
    best = Performance == min(Performance, na.rm = TRUE),
    transformed = if_else(str_detect(dataset, "log"), "Transformed", "Untransformed")
  ) %>%
  ungroup() %>%
  nest(.by = c(Metric, transformed))
# Create list for all plots
plots_boosted <- list()
# Create heatmaps for each metric, separately for the transformed and
# untransformed datasets. seq_len() is safe for an empty tibble (1:nrow is not).
for (i in seq_len(nrow(boosted_performance))) {
  plots_boosted[[i]] <- boosted_performance$data[[i]] %>%
    ggplot(aes(factor(max_depth), eta_nround)) +
    geom_tile(aes(fill = Performance)) +
    # Highlight the best tuning combination for this dataset-metric pair
    geom_point(data = filter(boosted_performance$data[[i]], best), color = "red") +
    facet_wrap(~dataset, scales = "free") +
    labs(
      # Axis labels corrected: the previous labels described random forest
      # tuning parameters, not the boosted tree parameters plotted here
      y = "Learning rate and boosting iterations (eta.nrounds)",
      x = "Maximum tree depth",
      title = str_c(
        "Comparison of ", boosted_performance$transformed[[i]], " models in terms of ", boosted_performance$Metric[[i]]
      )
    ) +
    heatmap_aesthetics2
}
# Show the third and fourth heatmaps together, then the first and second
cowplot::plot_grid(plotlist = plots_boosted[3:4], nrow = 2)
cowplot::plot_grid(plotlist = plots_boosted[1:2], nrow = 2)
# Plot the importance of all predictors for the best model of each dataset type
best_boosted %>%
  mutate(importance = map(fit, ~xgb.importance(model = .x))) %>%
  select(dataset, importance) %>%
  unnest(importance) %>%
  pivot_longer(-c(dataset, Feature), names_to = "type") %>%
  ggplot(aes(y = Feature, x = value, color = dataset)) +
  geom_point() +
  facet_wrap(~type, scales = "free_x")
# Set seed
set.seed(123)
# Inspect the best models joined with their source datasets.
# A vestigial empty mutate() call was removed — mutate() with no arguments
# returns its input unchanged, so the printed result is identical.
best_boosted %>%
  left_join(model_datasets)
# A tibble: 4 × 11
dataset fit whole train test train_dummy test_dummy x_train
<fct> <list> <list> <list> <list> <list> <list> <list>
1 full_or… <xgb.Bstr> <tibble> <tibble> <tibble> <tibble> <tibble> <dbl[…]>
2 reduced… <xgb.Bstr> <tibble> <tibble> <tibble> <tibble> <tibble> <dbl[…]>
3 full_log <xgb.Bstr> <tibble> <tibble> <tibble> <tibble> <tibble> <dbl[…]>
4 reduced… <xgb.Bstr> <tibble> <tibble> <tibble> <tibble> <tibble> <dbl[…]>
# ℹ 3 more variables: y_train <list>, x_test <list>, y_test <list>
# Enable parallel computing
plan(multisession)
# Extract partial predictions for the most important predictors.
# pdp::partial computes the partial dependence of each fitted model on one
# predictor using that model's training data; future_map2 parallelises the
# computation across models.
best_boosted2 <- best_boosted %>%
  left_join(model_datasets) %>%
  mutate(
    plots_Games = future_map2(fit, x_train, ~pdp::partial(.x, train = .y, pred.var = "Games")),
    plots_GS = future_map2(fit, x_train, ~pdp::partial(.x, train = .y, pred.var = "GS")),
    plots_FG = future_map2(fit, x_train, ~pdp::partial(.x, train = .y, pred.var = "FG")),
    plots_Two_P = future_map2(fit, x_train, ~pdp::partial(.x, train = .y, pred.var = "Two_P")),
    plots_PF = future_map2(fit, x_train, ~pdp::partial(.x, train = .y, pred.var = "PF"))
  ) %>%
  select(dataset, starts_with("plots")) %>%
  # One row per dataset-predictor pair; predictions kept as a list-column
  pivot_longer(-dataset, names_to = "predictor_name", values_to = "partial_predictions")
# Return to sequential execution
plan(sequential)
# Plot the partial dependence curves for the most important predictors
best_boosted2 %>%
  mutate(
    # Standardise the two pdp columns to common names for plotting
    partial_predictions = map(partial_predictions, set_names, nm = c("predictor", "salary"))
  ) %>%
  unnest(partial_predictions) %>%
  mutate(
    # Strip the "plots_" prefix to get readable facet labels
    predictor_name = str_remove(predictor_name, "plots_"),
    # Back-transform predictions from the log-outcome models to dollars
    salary = if_else(str_detect(dataset, "log"), exp(salary), salary)
  ) %>%
  ggplot(aes(predictor, salary, color = dataset, group = dataset)) +
  geom_line() +
  facet_wrap(~predictor_name, scales = "free") +
  theme(legend.position = "top")
Generally speaking, the more Field Goals, Games Started, and Two Points Field Goals a player had, the higher was the salary.
The optimal number of Field Goals and Two Point Goals were around 130 and 90 respectively, with higher numbers not increasing the salary anymore. In the case of number of Games Started, the optimal value was around 30.
In contrast, the more Games and Personal Fouls, the smaller the salary. Salary dropped strongly at around 10 Personal Fouls and remained at a similar level until another drop at more than 50 fouls. In the case of the number of Games, salary started consistently dropping after more than 10 games.
# Number of decimal places used when rounding performance metrics
round_number <- 3
# Root-mean-squared error between predictions and observed values.
# predicted: numeric vector of model predictions
# true_y: numeric vector of observed outcomes (same length as predicted)
# digits: decimal places for rounding (defaults to the global round_number,
#   keeping the original behavior while allowing callers to override it)
rmse_own <- function(predicted, true_y, digits = round_number) {
  round(sqrt(mean((true_y - predicted)^2)), digits)
}
# Mean absolute error between predictions and observed values
# (the original header comment incorrectly called this "Mean-median error")
mae_own <- function(predicted, true_y, digits = round_number) {
  round(mean(abs(true_y - predicted)), digits)
}
# R-squared: proportion of outcome variance explained by the predictions,
# computed as 1 - SSE / SST
rsquare_own <- function(predicted, true_y, digits = round_number) {
  r2 <- 1 - sum((true_y - predicted)^2) / sum((true_y - mean(true_y))^2)
  round(r2, digits)
}
# Remove unnecessary columns before merging the best models
best_lasso_reduced <- best_lasso_reduced %>%
  select(-test_dummy)
# Compare the best sets of models for all model types on the test dataset
best_tested <- best_boosted %>%
  add_row(best_forest) %>%
  add_row(best_lasso_reduced) %>%
  add_row(best_tree) %>%
  add_row(best_lm) %>%
  left_join(model_datasets) %>%
  mutate(
    # class() can return more than one element (e.g. formula-interface fits
    # have class c("randomForest.formula", "randomForest")); take the first
    # so map_chr does not error on a length-2 result
    fit_class = map_chr(fit, ~class(.x)[[1]]),
    # lm and rpart predict from the dummy-coded data frame; the other model
    # types predict from the numeric test matrix
    test = if_else(fit_class %in% c("lm", "rpart"), test_dummy, x_test)
  ) %>%
  mutate(
    predicted = map2(fit, test, predict),
    RMSE = map2_dbl(predicted, y_test, rmse_own),
    MAE = map2_dbl(predicted, y_test, mae_own),
    Rsquared = map2_dbl(predicted, y_test, rsquare_own)
  ) %>%
  arrange_own()
# Keep only the performance metrics for the model comparison
best_tested2 <- best_tested %>%
  select(dataset, fit_class, RMSE, MAE, Rsquared)
# Show the model comparison in a table
# (the columns are already selected above, so no extra select is needed)
kable2(best_tested2)
# Heatmap of R-squared across model types and datasets
best_tested2 %>%
  ggplot(aes(dataset, fit_class, fill = Rsquared)) +
  geom_tile()
The role of COVID
Limitations - The salary data was not available for all players, especially those with lower wages
Consider adding clean-up of intermediate objects and, out of curiosity, reporting the memory used at each stage.